library(tidyverse)
library(geomtextpath)
library(plotly)
library(htmlwidgets)
library(maps)
library(dplyr)
df <- read.csv("SBAnational.csv")
naics_mapping <- list(
"11" = "Agriculture, Forestry, Fishing and Hunting",
"21" = "Mining, Quarrying, and Oil and Gas Extraction",
"22" = "Utilities",
"23" = "Construction",
"31" = "Manufacturing",
"32" = "Manufacturing",
"33" = "Manufacturing",
"42" = "Wholesale Trade",
"44" = "Retail Trade",
"45" = "Retail Trade",
"48" = "Transportation and Warehousing",
"49" = "Transportation and Warehousing",
"51" = "Information",
"52" = "Finance and Insurance",
"53" = "Real Estate and Rental and Leasing",
"54" = "Professional, Scientific, and Technical Services",
"55" = "Management of Companies and Enterprises",
"56" = "Administrative and Support Services",
"61" = "Educational Services",
"62" = "Health Care and Social Assistance",
"71" = "Arts, Entertainment, and Recreation",
"72" = "Accommodation and Food Services",
"81" = "Other Services",
"92" = "Public Administration"
)
df <- df %>%
filter(df$MIS_Status != "", df$GrAppv != "", df$SBA_Appv != "",
df$NAICS != 0, df$State != "", df$NoEmp <= 1500,
df$LowDoc == "Y" | df$LowDoc == "N", df$Term > 0,
df$ApprovalFY < 2011 & df$ApprovalFY > 1999, df$NewExist != 0,
df$RevLineCr == "Y" | df$RevLineCr == "N", df$UrbanRural > 0)
df$GrAppv <- as.numeric(gsub("[$,]", "", df$GrAppv))
df$SBA_Appv <- as.numeric(gsub("[$,]", "", df$SBA_Appv))
df$DisbursementGross <- as.numeric(gsub("[$,]", "", df$DisbursementGross))
df$ApprovalFY <- as.numeric(df$ApprovalFY)
df <- df %>%
mutate(RealEstate = ifelse(df$Term >= 240, 1, 0)) %>%
mutate(MIS_Status = ifelse(df$MIS_Status == "CHGOFF", 1, 0)) %>%
mutate(FranchiseCode = ifelse(df$FranchiseCode > 1, 1, 0)) %>%
mutate(UrbanRural = ifelse(df$UrbanRural == 1, 1, 0)) %>%
mutate(NewExist = ifelse(df$NewExist == 1, 1, 0)) %>%
mutate(RevLineCr = ifelse(df$RevLineCr == "Y", 1, 0)) %>%
mutate(LowDoc = ifelse(df$LowDoc == "Y", 1, 0)) %>%
mutate(NAICS = as.integer(NAICS / 10000)) %>%
mutate(Industry = as.character(naics_mapping[as.character(NAICS)])) %>%
mutate(Proportion = SBA_Appv / GrAppv) %>%
mutate(Recession = ifelse(df$ApprovalFY > 2007 & df$ApprovalFY <= 2009,
1, 0))
df <- df %>%
select(State, Industry, FranchiseCode, UrbanRural,
Term, NoEmp, NewExist, RevLineCr, Recession,
RealEstate, DisbursementGross, LowDoc, NoEmp, Proportion, MIS_Status)
Determine whether or not data that occurred in the recession is skewed enough to warrant excluding it from the data set. Consider p-value < 0.05 significant
recession <- df %>%
select(MIS_Status, Recession)
chisq.test(table(recession))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(recession)
## X-squared = 2055.9, df = 1, p-value < 2.2e-16
recession_summary <- recession %>%
group_by(Recession, MIS_Status) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(Recession) %>%
mutate(percentage = count / sum(count)) %>%
ungroup()
recession_plot <- ggplotly(
ggplot(recession_summary, aes(
x = factor(Recession),
y = percentage,
fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
text = paste(
"Loan Occured during the Recession: ", ifelse(Recession == 1, "Yes",
"No"),
"<br>Default Status: ", ifelse(MIS_Status == 1, "Defaulted",
"Paid In Full"),
"<br>Percentage: ", scales::percent(percentage, accuracy = 0.01)
)
)) +
geom_bar(stat = "identity", position = "fill", alpha = 0.75) +
scale_y_continuous(labels = scales::percent) +
scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
labs(
title = "Loan Occured during the Recession vs Default Rate",
x = "Loan Occured during the Recession",
y = "Percentage",
fill = "Default Status"
),
tooltip = "text"
)
recession_plot
With a p-value <0.05 and there being a significant difference in default rate between loans that occurred during the Great Recession and loans that didn’t occur during the Great Recession we will exclude the data from the Great Recession
df <- df %>%
filter(df$Recession != 1)
df <- df %>% select(-Recession)
state_mapping <- setNames(state.name, state.abb)
state_test <- df %>%
select(State, MIS_Status)
chisq.test(table(state_test))
##
## Pearson's Chi-squared test
##
## data: table(state_test)
## X-squared = 5696.8, df = 50, p-value < 2.2e-16
state <- df %>%
mutate(StateName = tolower(state_mapping[State])) %>%
group_by(StateName) %>%
summarise(default_rate = mean(MIS_Status == 1))
map <- map_data("state")
map <- left_join(map, state, by = c("region" = "StateName"))
state_plot <- ggplotly(
ggplot(map, aes(x = long, y = lat, group = group, fill = default_rate,
text = paste(
"State: ", tools::toTitleCase(region),
"<br>Default Rate: ", scales::percent(default_rate, accuracy = 0.01)
)
)) +
geom_polygon(color = "white") +
scale_fill_gradient(low = "lightblue", high = "red",
labels = scales::percent) +
labs(
title = "Loan Default Rate by State",
fill = "Default Rate (%)"
),
tooltip = "text"
)
state_plot
industry <- df %>%
select(MIS_Status, Industry)
chisq.test(table(industry))
##
## Pearson's Chi-squared test
##
## data: table(industry)
## X-squared = 3347.4, df = 19, p-value < 2.2e-16
Create two summaries, one that includes both default and paid in full rates for graphing, one that includes only default rate for later use.
industry_summary <- industry %>%
group_by(Industry, MIS_Status) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(Industry) %>%
mutate(percentage = count / sum(count)) %>%
ungroup()
industry_summary2 <- industry %>%
group_by(Industry) %>%
summarise(default_rate = mean(MIS_Status == 1))
industry_plot <- ggplotly(
ggplot(industry_summary, aes(
x = factor(Industry),
y = percentage,
fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
text = paste(
"Industry: ", Industry,
"<br>Default Status: ", ifelse(MIS_Status == 1, "Defaulted",
"Paid In Full"),
"<br>Percentage: ", scales::percent(percentage, accuracy = 0.01)
)
)) +
geom_bar(stat = "identity", position = "fill", alpha = 0.75) +
scale_y_continuous(labels = scales::percent) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
title = "Industry vs Default Rate",
x = "Industry",
y = "Percentage",
fill = "Default Status"
),
tooltip = "text"
)
industry_plot
real_estate <- df %>%
select(MIS_Status, RealEstate)
chisq.test(table(real_estate))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(real_estate)
## X-squared = 6830.5, df = 1, p-value < 2.2e-16
real_estate_summary <- real_estate %>%
group_by(RealEstate, MIS_Status) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(RealEstate) %>%
mutate(percentage = count / sum(count)) %>%
ungroup()
real_estate_plot <- ggplotly(
ggplot(real_estate_summary, aes(
x = factor(RealEstate),
y = percentage,
fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
text = paste(
"Backed by Real Estate: ", ifelse(RealEstate == 1, "Yes", "No"),
"<br>Default Status: ", ifelse(MIS_Status == 1, "Defaulted",
"Paid In Full"),
"<br>Percentage: ", scales::percent(percentage, accuracy = 0.01)
)
)) +
geom_bar(stat = "identity", position = "fill", alpha = 0.75) +
scale_y_continuous(labels = scales::percent) +
scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
labs(
title = "Real Estate vs Default Rate",
x = "Backed By Real Estate",
y = "Percentage",
fill = "Default Status"
),
tooltip = "text"
)
real_estate_plot
franchise <- df %>%
select(MIS_Status, FranchiseCode)
chisq.test(table(franchise))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(franchise)
## X-squared = 257.88, df = 1, p-value < 2.2e-16
franchise_summary <- franchise %>%
group_by(FranchiseCode, MIS_Status) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(FranchiseCode) %>%
mutate(percentage = count / sum(count)) %>%
ungroup()
franchise_plot <- ggplotly(
ggplot(franchise_summary, aes(
x = factor(FranchiseCode),
y = percentage,
fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
text = paste(
"Is Franchise: ", ifelse(FranchiseCode == 1, "Yes", "No"),
"<br>Default Status: ", ifelse(MIS_Status == 1, "Defaulted",
"Paid In Full"),
"<br>Percentage: ", scales::percent(percentage, accuracy = 0.01)
)
)) +
geom_bar(stat = "identity", position = "fill", alpha = 0.75) +
scale_y_continuous(labels = scales::percent) +
scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
labs(
title = "Is Franchise vs Default Rate",
x = "Is Franchise",
y = "Percentage",
fill = "Default Status"
),
tooltip = "text"
)
franchise_plot
new <- df %>%
select(MIS_Status, NewExist)
chisq.test(table(new))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(new)
## X-squared = 27.349, df = 1, p-value = 1.698e-07
new_summary <- new %>%
group_by(NewExist, MIS_Status) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(NewExist) %>%
mutate(percentage = count / sum(count)) %>%
ungroup()
new_plot <- ggplotly(
ggplot(new_summary, aes(
x = factor(NewExist),
y = percentage,
fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
text = paste(
"Existed for more than 2 years: ", ifelse(NewExist == 1, "Yes", "No"),
"<br>Default Status: ", ifelse(MIS_Status == 1, "Defaulted",
"Paid In Full"),
"<br>Percentage: ", scales::percent(percentage, accuracy = 0.01)
)
)) +
geom_bar(stat = "identity", position = "fill", alpha = 0.75) +
scale_y_continuous(labels = scales::percent) +
scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
labs(
title = "Is Existing vs Default Rate",
x = "Is Existing",
y = "Percentage",
fill = "Default Status"
),
tooltip = "text"
)
new_plot
credit <- df %>%
select(MIS_Status, RevLineCr)
chisq.test(table(credit))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(credit)
## X-squared = 68.8, df = 1, p-value < 2.2e-16
credit_summary <- credit %>%
group_by(RevLineCr, MIS_Status) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(RevLineCr) %>%
mutate(percentage = count / sum(count)) %>%
ungroup()
credit_plot <- ggplotly(
ggplot(credit_summary, aes(
x = factor(RevLineCr),
y = percentage,
fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
text = paste(
"Has Revolving Line of Credit: ", ifelse(RevLineCr == 1, "Yes", "No"),
"<br>Default Status: ", ifelse(MIS_Status == 1, "Defaulted",
"Paid In Full"),
"<br>Percentage: ", scales::percent(percentage, accuracy = 0.01)
)
)) +
geom_bar(stat = "identity", position = "fill", alpha = 0.75) +
scale_y_continuous(labels = scales::percent) +
scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
labs(
title = "Has Revolving Line of Credit vs Default Rate",
x = "Has Revolving Line of Credit",
y = "Percentage",
fill = "Default Status"
),
tooltip = "text"
)
credit_plot
urban <- df %>%
select(MIS_Status, UrbanRural)
chisq.test(table(urban))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(urban)
## X-squared = 771.6, df = 1, p-value < 2.2e-16
urban_summary <- urban %>%
group_by(UrbanRural, MIS_Status) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(UrbanRural) %>%
mutate(percentage = count / sum(count)) %>%
ungroup()
urban_plot <- ggplotly(
ggplot(urban_summary, aes(
x = factor(UrbanRural),
y = percentage,
fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
text = paste(
"Urban or Rural: ", ifelse(UrbanRural == 1, "Urban", "Rural"),
"<br>Default Status: ", ifelse(MIS_Status == 1, "Defaulted",
"Paid In Full"),
"<br>Percentage: ", scales::percent(percentage, accuracy = 0.01)
)
)) +
geom_bar(stat = "identity", position = "fill", alpha = 0.75) +
scale_y_continuous(labels = scales::percent) +
scale_x_discrete(labels = c("0" = "Rural", "1" = "Urban")) +
labs(
title = "Ubran/Rural vs Default Rate",
x = "Ubran/Rural",
y = "Percentage",
fill = "Default Status"
),
tooltip = "text"
)
urban_plot
lowdoc <- df %>%
select(MIS_Status, LowDoc)
chisq.test(table(lowdoc))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(lowdoc)
## X-squared = 59.741, df = 1, p-value = 1.082e-14
lowdoc_summary <- lowdoc %>%
group_by(LowDoc, MIS_Status) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(LowDoc) %>%
mutate(percentage = count / sum(count)) %>%
ungroup()
lowdoc_plot <- ggplotly(
ggplot(lowdoc_summary, aes(
x = factor(LowDoc),
y = percentage,
fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
text = paste(
"In LowDoc program: ", ifelse(LowDoc == 1, "Yes", "No"),
"<br>Default Status: ", ifelse(MIS_Status == 1, "Defaulted",
"Paid In Full"),
"<br>Percentage: ", scales::percent(percentage, accuracy = 0.01)
)
)) +
geom_bar(stat = "identity", position = "fill", alpha = 0.75) +
scale_y_continuous(labels = scales::percent) +
scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
labs(
title = "In LowDoc Program vs Default Rate",
x = "In LowDoc Programt",
y = "Percentage",
fill = "Default Status"
),
tooltip = "text"
)
lowdoc_plot
sba_prop <- df %>%
select(MIS_Status, Proportion)
t.test(Proportion ~ MIS_Status, data = sba_prop, alternative = "two.sided")
##
## Welch Two Sample t-test
##
## data: Proportion by MIS_Status
## t = 53.883, df = 163065, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.03112824 0.03347829
## sample estimates:
## mean in group 0 mean in group 1
## 0.5972755 0.5649723
sba_prop_plot <- ggplotly(
ggplot(sba_prop, aes(
x = Proportion,
fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
text = paste(
"Term: ", after_stat(xmin), " to ", after_stat(xmax),
"<br>Count: ", after_stat(count)
))) +
geom_histogram(
position = "dodge",
binwidth = 0.25,
boundary = 0,
alpha = 0.7) +
xlim(0, max(sba_prop$Proportion)) +
facet_wrap(~ MIS_Status,
scales = "free_y",
labeller = as_labeller(c("0" = "Paid In Full",
"1" = "Defaulted"))) +
labs(
title = "Histogram of loan portion gauranteed by MIS Status",
x = "Porportion of loan gauranteed by SBA",
y = "Count",
fill = "Default Status"
),
tooltip = "text"
)
sba_prop_plot
Find quartile for data to make a barchart that displays the distribution of gross disbursement by quartile
disbur <- df %>%
mutate(
Quartile = cut(
DisbursementGross,
breaks = quantile(DisbursementGross, probs = seq(0, 1, by = 0.25)),
include.lowest = TRUE,
labels = c("0-25%", "25-50%", "50-75%", "75-100%")
)
) %>%
select(MIS_Status, DisbursementGross, Quartile)
t.test(DisbursementGross ~ MIS_Status, data = disbur,
alternative = "two.sided")
##
## Welch Two Sample t-test
##
## data: DisbursementGross by MIS_Status
## t = 77.279, df = 215958, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 64680.67 68046.94
## sample estimates:
## mean in group 0 mean in group 1
## 168989.3 102625.5
disbur_prop <- disbur %>%
group_by(MIS_Status, Quartile) %>%
summarise(Count = n()) %>%
group_by(MIS_Status) %>%
mutate(Proportion = Count / sum(Count))
## `summarise()` has grouped output by 'MIS_Status'. You can override using the
## `.groups` argument.
disbur_plot <- ggplotly(
ggplot(disbur_prop, aes(x = Quartile, y = Proportion, fill = Quartile,
text = paste(
"Quartile: ", Quartile,
"<br>Percentage: ", scales::percent(Proportion, accuracy = 0.01)
))) +
geom_bar(stat = "identity", alpha = 0.75) +
facet_wrap(~ MIS_Status, labeller =
as_labeller(c(`0` = "Paid In Full", `1` = "Default"))) +
scale_y_continuous(labels = scales::percent) +
labs(
title = "Gross Disbursement Distribution by Quartiles",
x = "Gross Disbursement Quartile",
y = "Proportion",
fill = "Quartile"
) +
theme_minimal() +
theme(
legend.position = "bottom",
strip.text = element_text(face = "bold"),
plot.title = element_text(hjust = 0.5)
),
tooltip = "text"
)
disbur_plot
Do same process as done with the gross disbursement
emp <- df %>%
mutate(
Quartile = cut(
NoEmp,
breaks = quantile(NoEmp, probs = seq(0, 1, by = 0.25)),
include.lowest = TRUE,
labels = c("0-25%", "25-50%", "50-75%", "75-100%")
)
) %>%
select(MIS_Status, NoEmp, Quartile)
t.test(NoEmp ~ MIS_Status, data = emp,
alternative = "two.sided")
##
## Welch Two Sample t-test
##
## data: NoEmp by MIS_Status
## t = 51.518, df = 236308, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 2.899522 3.128870
## sample estimates:
## mean in group 0 mean in group 1
## 8.684455 5.670259
emp_prop <- emp %>%
group_by(MIS_Status, Quartile) %>%
summarise(Count = n()) %>%
group_by(MIS_Status) %>%
mutate(Proportion = Count / sum(Count))
## `summarise()` has grouped output by 'MIS_Status'. You can override using the
## `.groups` argument.
emp_plot <- ggplotly(
ggplot(emp_prop, aes(x = Quartile, y = Proportion, fill = Quartile,
text = paste (
"Quartile: ", Quartile,
"<br>Percentage: ", scales::percent(Proportion, accuracy = 0.01)
))) +
geom_bar(stat = "identity", alpha = 0.75) +
facet_wrap(~ MIS_Status, labeller =
as_labeller(c(`0` = "Paid In Full", `1` = "Default"))) +
scale_y_continuous(labels = scales::percent) +
labs(
title = "Number of employee Distribution by Quartiles",
x = "Disbursement Quartile",
y = "Proportion",
fill = "Quartile"
) +
theme_minimal() +
theme(
legend.position = "bottom",
strip.text = element_text(face = "bold"),
plot.title = element_text(hjust = 0.5)
),
tooltip = "text"
)
emp_plot
term <- df %>%
select(MIS_Status, Term)
t.test(Term ~ MIS_Status, data = term, alternative = "two.sided")
##
## Welch Two Sample t-test
##
## data: Term by MIS_Status
## t = 272.38, df = 233311, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 48.46045 49.16292
## sample estimates:
## mean in group 0 mean in group 1
## 94.92662 46.11493
Filter few data points to make graph more balanced
term <- term %>%
filter(Term <= 300)
term_plot <- ggplotly(
ggplot(term, aes(
x = Term,
fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
text = paste(
"Term: ", after_stat(xmin), " to ", after_stat(xmax),
"<br>Count: ", after_stat(count)
))) +
geom_histogram(
position = "dodge",
binwidth = 25,
boundary = 0,
alpha = 0.7) +
xlim(0, max(term$Term)) +
facet_wrap(~ MIS_Status,
scales = "free_y",
labeller = as_labeller(c("0" = "Paid In Full",
"1" = "Defaulted"))) +
labs(
title = "Histogram of Term by MIS Status",
x = "Term",
y = "Count",
fill = "Default Status"
),
tooltip = "text"
)
term_plot